{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'paddlehub'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-9-9ddacf036f2f>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      3\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpyplot\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mplt\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      4\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mcollections\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mCounter\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[1;32mimport\u001b[0m \u001b[0mpaddlehub\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mhub\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      6\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mpaddle\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      7\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmodel_selection\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mtrain_test_split\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'paddlehub'"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from collections import Counter\n",
    "import paddlehub as hub\n",
    "import paddle\n",
    "from sklearn.model_selection import train_test_split\n",
    "from paddlehub.datasets.base_nlp_dataset import TextClassificationDataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>num</th>\n",
       "      <th>text</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>瞧着这小样儿，突然间感动了，爸妈怎么把我拉扯大的呀～～～</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>习惯和凑和的力量何其强大，革命总是被逼到无法接受的程度以后才会发生，这时仍要忍受数小时的漫长...</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>5.尽量在7点前起床，这样有利于排出宿便；</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>原来不知道在何时开始，我已经不再是儿童了，不再是那个可以撒娇的小孩子了！</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>宝贝，节日快乐。</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>我们还要这样的阳光吗？</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>据说济南最低温度24度，可今天青岛最高才21度……</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>据说济南最低温度24度，可今天青岛最高才21度……</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>这才是你的舞台啊！！！</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>享受每一刻的感觉，欣赏每一处的风景，这就是人生。</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   num                                               text  label\n",
       "0    1                       瞧着这小样儿，突然间感动了，爸妈怎么把我拉扯大的呀～～～    3.0\n",
       "1    2  习惯和凑和的力量何其强大，革命总是被逼到无法接受的程度以后才会发生，这时仍要忍受数小时的漫长...    0.0\n",
       "2    3                              5.尽量在7点前起床，这样有利于排出宿便；    0.0\n",
       "3    4               原来不知道在何时开始，我已经不再是儿童了，不再是那个可以撒娇的小孩子了！    2.0\n",
       "4    5                                           宝贝，节日快乐。    1.0\n",
       "5    6                                        我们还要这样的阳光吗？    5.0\n",
       "6    7                          据说济南最低温度24度，可今天青岛最高才21度……    6.0\n",
       "7    8                          据说济南最低温度24度，可今天青岛最高才21度……    6.0\n",
       "8    9                                        这才是你的舞台啊！！！    5.0\n",
       "9   10                           享受每一刻的感觉，欣赏每一处的风景，这就是人生。    0.0"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_excel( 'C:/Users/Administrator/Desktop/moods_classify8_unprocessed.xlsx')\n",
    "df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "ename": "TypeError",
     "evalue": "Cannot interpret '<attribute 'dtype' of 'numpy.generic' objects>' as a data type",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-15-a74c58233b9e>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36minfo\u001b[1;34m(self, verbose, buf, max_cols, memory_usage, null_counts)\u001b[0m\n\u001b[0;32m   2272\u001b[0m                         self.index._is_memory_usage_qualified()):\n\u001b[0;32m   2273\u001b[0m                     \u001b[0msize_qualifier\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m'+'\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2274\u001b[1;33m             \u001b[0mmem_usage\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmemory_usage\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdeep\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdeep\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   2275\u001b[0m             lines.append(\"memory usage: {mem}\\n\".format(\n\u001b[0;32m   2276\u001b[0m                 mem=_sizeof_fmt(mem_usage, size_qualifier)))\n",
      "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36mmemory_usage\u001b[1;34m(self, index, deep)\u001b[0m\n\u001b[0;32m   2366\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2367\u001b[0m             result = Series(self.index.memory_usage(deep=deep),\n\u001b[1;32m-> 2368\u001b[1;33m                             index=['Index']).append(result)\n\u001b[0m\u001b[0;32m   2369\u001b[0m         \u001b[1;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2370\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\series.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, data, index, dtype, name, copy, fastpath)\u001b[0m\n\u001b[0;32m    273\u001b[0m             \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    274\u001b[0m                 data = _sanitize_array(data, index, dtype, copy,\n\u001b[1;32m--> 275\u001b[1;33m                                        raise_cast_failure=True)\n\u001b[0m\u001b[0;32m    276\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    277\u001b[0m                 \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mSingleBlockManager\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfastpath\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\series.py\u001b[0m in \u001b[0;36m_sanitize_array\u001b[1;34m(data, index, dtype, copy, raise_cast_failure)\u001b[0m\n\u001b[0;32m   4147\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   4148\u001b[0m             subarr = construct_1d_arraylike_from_scalar(\n\u001b[1;32m-> 4149\u001b[1;33m                 value, len(index), dtype)\n\u001b[0m\u001b[0;32m   4150\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   4151\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\dtypes\\cast.py\u001b[0m in \u001b[0;36mconstruct_1d_arraylike_from_scalar\u001b[1;34m(value, length, dtype)\u001b[0m\n\u001b[0;32m   1199\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mis_integer_dtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0misna\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1200\u001b[0m             \u001b[0mdtype\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfloat64\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1201\u001b[1;33m         \u001b[0msubarr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mempty\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlength\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1202\u001b[0m         \u001b[0msubarr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfill\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1203\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mTypeError\u001b[0m: Cannot interpret '<attribute 'dtype' of 'numpy.generic' objects>' as a data type"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "num      False\n",
       "text      True\n",
       "label     True\n",
       "dtype: bool"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.isnull().any()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>num</th>\n",
       "      <th>text</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>18</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>48</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>92</th>\n",
       "      <td>93</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>367</th>\n",
       "      <td>368</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1101</th>\n",
       "      <td>1102</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1112</th>\n",
       "      <td>1113</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1161</th>\n",
       "      <td>1162</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1208</th>\n",
       "      <td>1209</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1249</th>\n",
       "      <td>1250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1694</th>\n",
       "      <td>1695</td>\n",
       "      <td>我想说中国皇帝有上千个老婆呢。</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1705</th>\n",
       "      <td>1706</td>\n",
       "      <td>2、总是和别人比较。</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2141</th>\n",
       "      <td>2142</td>\n",
       "      <td>在最艰难的时刻，更要相信自己手中握有最好的猎枪。</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2188</th>\n",
       "      <td>2189</td>\n",
       "      <td>今日话题，走着：全国高考今日拉开帷幕，各地高考作文题陆续公布。</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2278</th>\n",
       "      <td>2279</td>\n",
       "      <td>今天六一节，感觉没有什么不同的，很多朋友互相问候节日快乐！</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2360</th>\n",
       "      <td>2361</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2361</th>\n",
       "      <td>2362</td>\n",
       "      <td>禁买令最大的问题在于地域性歧视方面，一是户籍问题，而是限家奴而不限友邦。</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2362</th>\n",
       "      <td>2363</td>\n",
       "      <td>蔡健雅唱过，得不到的就更加爱，太容易来的就不理睬。</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4699</th>\n",
       "      <td>4700</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4700</th>\n",
       "      <td>4701</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4841</th>\n",
       "      <td>4842</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4855</th>\n",
       "      <td>4856</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4910</th>\n",
       "      <td>4911</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4943</th>\n",
       "      <td>4944</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5005</th>\n",
       "      <td>5006</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5069</th>\n",
       "      <td>5070</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10870</th>\n",
       "      <td>10871</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10923</th>\n",
       "      <td>10924</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11108</th>\n",
       "      <td>11109</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11672</th>\n",
       "      <td>11673</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12324</th>\n",
       "      <td>12325</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13252</th>\n",
       "      <td>13253</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14592</th>\n",
       "      <td>14593</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20281</th>\n",
       "      <td>20282</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20380</th>\n",
       "      <td>20381</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20478</th>\n",
       "      <td>20479</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20569</th>\n",
       "      <td>20570</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23780</th>\n",
       "      <td>23781</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         num                                  text  label\n",
       "17        18                                   NaN    0.0\n",
       "47        48                                   NaN    0.0\n",
       "92        93                                   NaN    0.0\n",
       "367      368                                   NaN    0.0\n",
       "1101    1102                                   NaN    0.0\n",
       "1112    1113                                   NaN    0.0\n",
       "1161    1162                                   NaN    0.0\n",
       "1208    1209                                   NaN    0.0\n",
       "1249    1250                                   NaN    0.0\n",
       "1694    1695                       我想说中国皇帝有上千个老婆呢。    NaN\n",
       "1705    1706                            2、总是和别人比较。    NaN\n",
       "2141    2142              在最艰难的时刻，更要相信自己手中握有最好的猎枪。    NaN\n",
       "2188    2189       今日话题，走着：全国高考今日拉开帷幕，各地高考作文题陆续公布。    NaN\n",
       "2278    2279         今天六一节，感觉没有什么不同的，很多朋友互相问候节日快乐！    NaN\n",
       "2360    2361                                   NaN    0.0\n",
       "2361    2362  禁买令最大的问题在于地域性歧视方面，一是户籍问题，而是限家奴而不限友邦。    NaN\n",
       "2362    2363             蔡健雅唱过，得不到的就更加爱，太容易来的就不理睬。    NaN\n",
       "4699    4700                                   NaN    0.0\n",
       "4700    4701                                   NaN    0.0\n",
       "4841    4842                                   NaN    0.0\n",
       "4855    4856                                   NaN    0.0\n",
       "4910    4911                                   NaN    0.0\n",
       "4943    4944                                   NaN    0.0\n",
       "5005    5006                                   NaN    0.0\n",
       "5069    5070                                   NaN    0.0\n",
       "10870  10871                                   NaN    0.0\n",
       "10923  10924                                   NaN    1.0\n",
       "11108  11109                                   NaN    5.0\n",
       "11672  11673                                   NaN    0.0\n",
       "12324  12325                                   NaN    0.0\n",
       "13252  13253                                   NaN    5.0\n",
       "14592  14593                                   NaN    5.0\n",
       "20281  20282                                   NaN    0.0\n",
       "20380  20381                                   NaN    0.0\n",
       "20478  20479                                   NaN    0.0\n",
       "20569  20570                                   NaN    0.0\n",
       "23780  23781                                   NaN    5.0"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df.isnull().values==True]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "ename": "SyntaxError",
     "evalue": "invalid syntax (<ipython-input-18-2bb18d92f54f>, line 1)",
     "output_type": "error",
     "traceback": [
      "\u001b[1;36m  File \u001b[1;32m\"<ipython-input-18-2bb18d92f54f>\"\u001b[1;36m, line \u001b[1;32m1\u001b[0m\n\u001b[1;33m    df.dropna(subset = ['text' , 'label'].axis = 0,how = 'any' ,inplace = True)\u001b[0m\n\u001b[1;37m                                               ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid syntax\n"
     ]
    }
   ],
   "source": [
    "df.dropna(subset = ['text' , 'label'].axis = 0,how = 'any' ,inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "num      False\n",
       "text      True\n",
       "label     True\n",
       "dtype: bool"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.isnull().any()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.boxplot(x = df.label,\n",
    "           whis = 1.5,\n",
    "           width = 0.8,\n",
    "           patch_artist = True,\n",
    "           showmeans = True,\n",
    "           boxprops = { 'facecolor' ; 'steelblue'})"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
